Find and analyse drought events#
import sys
import os
import glob
import xarray as xr
from functools import partial
import datetime
import numpy as np
import plotly.graph_objects as go
import dask.array as da
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import label, generate_binary_structure
import geopandas as gpd
import pandas as pd
from scipy.ndimage import label, generate_binary_structure
import hvplot.xarray # to plot xarray with hvplot
import cartopy.crs as ccrs
Load data function#
def get_spi_dataset(acc_period: int = 1, years: list | None = None):
    """Load monthly SPI NetCDF files into a single lazily-chunked dataset.

    Parameters:
    - acc_period: SPI accumulation period in months (e.g. 1, 3, 12); selects
      the spi{acc_period} subfolder and file prefix.
    - years: years to load; defaults to [2020]. (Default is created inside
      the function — a mutable default argument would be shared across calls.)

    Returns:
    - An xarray Dataset concatenated along 'time' via open_mfdataset.
    """
    if years is None:
        years = [2020]
    data_root_folder = '/data1/drought_dataset/spi/'
    spi_folder = os.path.join(data_root_folder, f'spi{acc_period}')
    spi_paths = []
    for year in years:
        # sorted() keeps the monthly files in chronological order per year.
        pattern = os.path.join(
            spi_folder,
            f'SPI{acc_period}_gamma_global_era5_moda_ref1991to2020_{year}*.nc')
        spi_paths.extend(sorted(glob.glob(pattern)))
    return xr.open_mfdataset(spi_paths, chunks={'time': "auto"},
                             concat_dim="time", combine='nested', parallel=False)
def get_spei_dataset(acc_period: int = 1, years: list | None = None):
    """Load monthly SPEI NetCDF files into a single lazily-chunked dataset.

    Parameters:
    - acc_period: SPEI accumulation period in months; selects the
      spei{acc_period} subfolder and file prefix.
    - years: years to load; defaults to [2020]. (Default is created inside
      the function — a mutable default argument would be shared across calls.)

    Returns:
    - An xarray Dataset concatenated along 'time' via open_mfdataset.
    """
    if years is None:
        years = [2020]
    data_root_folder = '/data1/drought_dataset/spei/'
    # Original built an unused path under the wrong 'spi' folder name.
    spei_folder = os.path.join(data_root_folder, f'spei{acc_period}')
    spei_paths = []
    for year in years:
        # sorted() keeps the monthly files in chronological order per year.
        pattern = os.path.join(
            spei_folder,
            f'SPEI{acc_period}_genlogistic_global_era5_moda_ref1991to2020_{year}*.nc')
        spei_paths.extend(sorted(glob.glob(pattern)))
    return xr.open_mfdataset(spei_paths, chunks={'time': "auto"},
                             concat_dim="time", combine='nested', parallel=False)
def mask_invalid_values(ds, variable, value=-9999):
    """Replace the sentinel fill value in ds[variable] with NaN.

    Mutates ds in place (the column is reassigned) and also returns it so the
    call can be chained.
    """
    valid = ds[variable] != value
    ds[variable] = ds[variable].where(valid, np.nan)
    return ds
def subset_region(dataset, variable, bbox):
    """Subset a dataset with 'lat'/'lon' coordinates to a geographic bounding box.

    Parameters:
    - dataset: xarray Dataset/DataArray with 'lat' and 'lon' coordinates.
    - variable: unused; kept so existing call sites remain valid.
    - bbox: (west, south, east, north) bounding box in degrees.

    Returns:
    - The dataset restricted to the bounding box (cells outside are dropped).

    Raises:
    - ValueError: if the bounding box falls outside the dataset's coordinate range.
    """
    lat_bounds = [bbox[1], bbox[3]]  # from south to north
    lon_bounds = [bbox[0], bbox[2]]  # from west to east

    # Drop coordinate rows/columns that are entirely NaN before range checks,
    # otherwise min()/max() below could return NaN.
    if dataset['lat'].isnull().any():
        dataset = dataset.dropna(dim='lat', how='all')
    if dataset['lon'].isnull().any():
        dataset = dataset.dropna(dim='lon', how='all')
    # NOTE: the original also called dataset.fillna(np.nan) here, which is a
    # no-op (it fills NaN with NaN) and has been removed.

    # Ensure the requested bounds lie within the data's coordinate range.
    lat_min, lat_max = dataset['lat'].min().item(), dataset['lat'].max().item()
    lon_min, lon_max = dataset['lon'].min().item(), dataset['lon'].max().item()
    if lat_bounds[0] < lat_min or lat_bounds[1] > lat_max or lon_bounds[0] < lon_min or lon_bounds[1] > lon_max:
        raise ValueError("The specified latitude/longitude bounds are outside the range of the dataset.")

    # where(..., drop=True) is insensitive to coordinate ordering (lat is often
    # stored north-to-south), unlike positional slicing.
    return dataset.where(
        (dataset['lat'] >= lat_bounds[0]) & (dataset['lat'] <= lat_bounds[1]) &
        (dataset['lon'] >= lon_bounds[0]) & (dataset['lon'] <= lon_bounds[1]),
        drop=True
    )
def get_spei_significance_dataset(variable='SPEI1', year=2020):
    """Load the twelve monthly SPEI significance files for one year.

    Builds one path per month (zero-padded) under the variable's parameter
    folder and concatenates them along 'time'.
    """
    data_root_folder = '/data1/drought_dataset/spei/'
    quality_paths = [
        f'{data_root_folder}{variable.lower()}/parameter/{variable}_significance_global_era5_moda_{year}{month:02d}_ref1991to2020.nc'
        for month in range(1, 13)
    ]
    return xr.open_mfdataset(quality_paths, concat_dim="time", combine='nested', parallel=False)
def get_spi_significance_dataset(variable='SPI1', year=2020):
    """Load the twelve monthly SPI significance files for one year.

    Builds one path per month (zero-padded) under the variable's parameter
    folder and concatenates them along 'time'.
    """
    data_root_folder = '/data1/drought_dataset/spi/'
    quality_paths = [
        f'{data_root_folder}{variable.lower()}/parameter/{variable}_significance_global_era5_moda_{year}{month:02d}_ref1991to2020.nc'
        for month in range(1, 13)
    ]
    return xr.open_mfdataset(quality_paths, concat_dim="time", combine='nested', parallel=False)
Load dataset#
# Load dataset
# 85 years (1940-2024) of monthly 12-month-accumulation SPEI data.
spei_data = get_spei_dataset(acc_period=12, years=list(range(1940, 2025)))
# Replace the -9999 fill value with NaN. mask_invalid_values mutates and
# returns the same object, so spei48_region is an alias of spei_data.
spei48_region = mask_invalid_values(spei_data, variable='SPEI12')
Filter dataset for specific bounding box#
# Get a subset of the dataset for a bbox
# NOTE(review): gpd.datasets is deprecated (removed in GeoPandas 1.0);
# the Natural Earth low-res shapefile should be vendored or downloaded.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world = world.to_crs(epsg=4326)
# country_list = world['name'].unique().tolist()
# country_list.sort()
# country_shape = world[world['name'] == 'Kenya']
# Natural Earth low-res uses the abbreviated name 'S. Sudan' for South Sudan.
country_shape = world[world['name'] == 'S. Sudan']
# Attach a CRS so rioxarray can clip. inplace=True means the aliased
# spei48_region is updated as well.
spei_data = spei_data.rio.write_crs("EPSG:4326", inplace=True)
spei_data_country = spei48_region.rio.clip(country_shape.geometry, world.crs, drop=True)
/tmp/ipykernel_2048911/1990148899.py:2: FutureWarning: The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Extract the SPEI12 DataArray for the clipped country for all later analysis.
spei = spei_data_country['SPEI12']
spei
<xarray.DataArray 'SPEI12' (time: 1009, lat: 34, lon: 45)> Size: 12MB
dask.array<getitem, shape=(1009, 34, 45), dtype=float64, chunksize=(1, 34, 45), chunktype=numpy.ndarray>
Coordinates:
* time (time) datetime64[ns] 8kB 1940-01-01T06:00:00 ... 2024-01-01...
* lon (lon) float64 360B 24.25 24.5 24.75 25.0 ... 34.75 35.0 35.25
* lat (lat) float64 272B 12.0 11.75 11.5 11.25 ... 4.5 4.25 4.0 3.75
spatial_ref int64 8B 0
Attributes:
long_name: Standardized Drought Index (SPEI12)
units: -spei.hvplot(
clim=(-8,8),
groupby="time",
widget_type="scrubber",
widget_location="bottom",
projection=ccrs.PlateCarree(),
coastline='10m',
cmap='BrBG',
features=['borders']
)
Analyse each month to determine whether there was a drought, while also classifying the conditions for the whole region. E.g. a time point counts as a severe drought if at least a minimum number of grid points have SPEI < -1.5#
# Count valid (non-NaN) grid cells in one monthly slice — this is the
# per-month cell budget the classification threshold is compared against.
np.count_nonzero(~np.isnan(spei[49]).values)
# spei[49]
818
Setup drought severity classification function and classes#
import xarray as xr
import numpy as np
def classify_drought_severity(spei, classes, conditions, threshold=50):
    """
    Classifies drought severity based on SPEI values and counts grid points in each class.

    Parameters:
    - spei: An xarray DataArray containing SPEI values (dimensions: time, lat, lon).
    - classes: A list of class names (e.g., ['Extreme Drought', 'Severe Drought', ...]).
    - conditions: A list of boolean conditions corresponding to each class.
    - threshold: Minimum number of grid points required to classify a time step
      into a specific class.

    Returns:
    - result_df: A pandas DataFrame with counts of grid points in each class for
      each time step, including a 'Final Classification' column.
    """
    # Per-class grid-cell counts per time step, stacked along a 'class' dim.
    per_class_counts = xr.concat(
        [cond.sum(dim=['lat', 'lon']) for cond in conditions],
        dim=pd.Index(classes, name="class"),
    )

    # Reshape to a time-indexed frame with one column per class.
    result_df = (
        per_class_counts
        .to_dataframe(name='count')
        .reset_index()
        .pivot(index='time', columns='class', values='count')
        .fillna(0)
    )

    def first_class_over_threshold(row):
        # Classes are checked in the caller-supplied priority order; the
        # first one with enough grid cells wins.
        for name in classes:
            if row[name] >= threshold:
                return name
        return 'No Data'  # No class reached the threshold

    result_df['Final Classification'] = result_df.apply(first_class_over_threshold, axis=1)
    return result_df
# Class boundaries on the SPEI scale, ordered to match `classes` below.
# Each SPEI value falls into exactly one bin: dry bins are [lower, upper)
# and wet bins are (lower, upper]. The original used >= on both sides of
# the wet boundaries, so values exactly equal to 1 or 1.5 were counted in
# two classes at once (double-counting grid cells).
conditions = [
    spei < -2,                      # 'Extremely Dry'
    (spei >= -2) & (spei < -1.5),   # 'Severely Dry'
    (spei >= -1.5) & (spei < -1),   # 'Moderately Dry'
    (spei >= -1) & (spei < 0),      # 'Mildly Dry'
    (spei >= 0) & (spei <= 1),      # 'Mildly Wet'
    (spei > 1) & (spei <= 1.5),     # 'Moderately Wet'
    (spei > 1.5) & (spei <= 2),     # 'Severely Wet'
    spei > 2                        # 'Extremely Wet'
]

# Class names in the same order as `conditions`; this order is also the
# priority order classify_drought_severity uses for the final label.
classes = ['Extremely Dry',
           'Severely Dry',
           'Moderately Dry',
           'Mildly Dry',
           'Mildly Wet',
           'Moderately Wet',
           'Severely Wet',
           'Extremely Wet']
Classify months in spei#
# Get the result DataFrame
# Per-month grid-cell counts for every severity class plus the region-level
# 'Final Classification' (first class, in priority order, meeting the
# default threshold of 50 cells).
result_df = classify_drought_severity(spei, classes, conditions)
result_df = result_df.reset_index()
# Output the result
result_df
| class | time | Extremely Dry | Extremely Wet | Mildly Dry | Mildly Wet | Moderately Dry | Moderately Wet | Severely Dry | Severely Wet | Final Classification |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1940-01-01 06:00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No Data |
| 1 | 1940-02-01 06:00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No Data |
| 2 | 1940-03-01 06:00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No Data |
| 3 | 1940-04-01 06:00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No Data |
| 4 | 1940-05-01 06:00:00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No Data |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1004 | 2023-09-01 06:00:00 | 756 | 0 | 5 | 0 | 13 | 0 | 44 | 0 | Extremely Dry |
| 1005 | 2023-10-01 06:00:00 | 723 | 0 | 14 | 0 | 27 | 0 | 54 | 0 | Extremely Dry |
| 1006 | 2023-11-01 06:00:00 | 644 | 0 | 33 | 2 | 41 | 0 | 98 | 0 | Extremely Dry |
| 1007 | 2023-12-01 06:00:00 | 647 | 0 | 34 | 1 | 38 | 0 | 98 | 0 | Extremely Dry |
| 1008 | 2024-01-01 06:00:00 | 652 | 0 | 40 | 1 | 39 | 0 | 86 | 0 | Extremely Dry |
1009 rows × 10 columns
Generate a bar plot for the dataset to visualize drought events
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.dates as mdates

# Map classifications to colors: dark browns/oranges for dry classes,
# greens for wet classes, gray for months with no qualifying class.
class_color_map = {
    'Extremely Dry': '#291700',
    'Severely Dry': '#8F5100',
    'Moderately Dry': '#FF961F',
    'Mildly Dry': '#FFBF69',
    'Mildly Wet': '#A1F7D5',
    'Moderately Wet': '#43EFAA',
    'Severely Wet': '#0D965F',
    'Extremely Wet': '#074B30',
    'No Data': '#cccccc'  # Gray
}

# Map the classifications to colors
result_df['Color'] = result_df['Final Classification'].map(class_color_map)

# Create the plot
plt.figure(figsize=(12, 4))  # Adjust the width and height of the plot

# Plot bars: one unit-height bar per month, colored by classification.
# On a date axis the bar width is in days, so 60 ≈ two months per bar.
plt.bar(result_df['time'], 1, color=result_df['Color'], width=60, align='edge')

# Customize x-axis and y-axis
plt.gca().yaxis.set_visible(False)  # Hide y-axis (bar heights carry no meaning)

# Set x-axis major locator and formatter to show only yearly ticks
plt.gca().xaxis.set_major_locator(mdates.YearLocator())  # Place ticks at yearly intervals
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # Format x-axis labels to show only year

# Set x-axis limits
plt.xlim(pd.Timestamp(result_df.time.min()), pd.Timestamp(result_df.time.max()))

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)

# Label the x-axis
plt.xlabel('Time')

# Set the title of the plot
plt.title('Drought Classification Over Time')

# Add legend: proxy Line2D handles, one per classification color.
handles = [plt.Line2D([0], [0], color=color, lw=4) for color in class_color_map.values()]
labels = list(class_color_map.keys())
plt.legend(handles, labels, title='Drought Classification', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout for better fit
plt.tight_layout()

# Show the plot
plt.show()
import plotly.graph_objs as go
import pandas as pd
import numpy as np

# Map classifications to colors: dark browns/oranges for dry classes,
# greens for wet classes, gray for months with no qualifying class.
class_color_map = {
    'Extremely Dry': '#291700',
    'Severely Dry': '#8F5100',
    'Moderately Dry': '#FF961F',
    'Mildly Dry': '#FFBF69',
    'Mildly Wet': '#A1F7D5',
    'Moderately Wet': '#43EFAA',
    'Severely Wet': '#0D965F',
    'Extremely Wet': '#074B30',
    'No Data': '#cccccc'  # Gray
}

# Map the classifications to colors and build categorical month labels.
result_df['Color'] = result_df['Final Classification'].map(class_color_map)
result_df['Label'] = result_df['time'].dt.strftime('%Y-%m')

# Create the plot
fig = go.Figure()

# Add bars: one unit-height bar per month, colored by classification.
# FIX: bar.marker.line must be a Line object or dict — the original passed
# the color Series directly, which raised ValueError; wrap it as
# line=dict(color=...) so the outline matches the fill.
fig.add_trace(go.Bar(
    x=result_df['Label'],
    y=[1] * len(result_df),  # All bars have height 1
    marker=dict(color=result_df['Color'], line=dict(color=result_df['Color'])),
    name='Drought Classification',
    opacity=1
))

# Update x-axis and y-axis
fig.update_xaxes(
    title_text='Time',
    tickangle=90,  # Rotate x-axis labels
    type='category',
    categoryorder='category ascending',  # Ensure labels are in the order of the data
    showgrid=False  # Hide grid lines to reduce visual clutter
)
fig.update_yaxes(
    visible=False  # Hide y-axis (bar heights carry no meaning)
)

# Update layout for background and legend
fig.update_layout(
    title='Drought Classification Over Time',
    legend_title='Drought Classification',
    legend=dict(
        x=1.05,  # Positioning the legend to the right of the plot
        y=1,
        orientation='v',
        traceorder='normal'
    ),
    bargap=0,
    margin=dict(l=50, r=200, t=50, b=50),
    template='plotly',
    paper_bgcolor='white',
    plot_bgcolor='white',
)

# Show the plot
fig.show()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[12], line 28
25 fig = go.Figure()
27 # Add bars
---> 28 fig.add_trace(go.Bar(
29 x=result_df['Label'],
30 y=[1] * len(result_df), # All bars have height 1
31 marker=dict(color=result_df['Color'], line=result_df['Color']),
32 # width=1, # Adjust width for better visual appearance
33 name='Drought Classification',
34 opacity=1 # Adjust opacity to improve visibility
35 ))
37 # Update x-axis and y-axis
38 fig.update_xaxes(
39 title_text='Time',
40 tickangle=90, # Rotate x-axis labels
(...)
43 showgrid=False # Hide grid lines to reduce visual clutter
44 )
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/graph_objs/_bar.py:3313, in Bar.__init__(self, arg, alignmentgroup, base, basesrc, cliponaxis, constraintext, customdata, customdatasrc, dx, dy, error_x, error_y, hoverinfo, hoverinfosrc, hoverlabel, hovertemplate, hovertemplatesrc, hovertext, hovertextsrc, ids, idssrc, insidetextanchor, insidetextfont, legend, legendgroup, legendgrouptitle, legendrank, legendwidth, marker, meta, metasrc, name, offset, offsetgroup, offsetsrc, opacity, orientation, outsidetextfont, selected, selectedpoints, showlegend, stream, text, textangle, textfont, textposition, textpositionsrc, textsrc, texttemplate, texttemplatesrc, uid, uirevision, unselected, visible, width, widthsrc, x, x0, xaxis, xcalendar, xhoverformat, xperiod, xperiod0, xperiodalignment, xsrc, y, y0, yaxis, ycalendar, yhoverformat, yperiod, yperiod0, yperiodalignment, ysrc, zorder, **kwargs)
3311 _v = marker if marker is not None else _v
3312 if _v is not None:
-> 3313 self["marker"] = _v
3314 _v = arg.pop("meta", None)
3315 _v = meta if meta is not None else _v
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/basedatatypes.py:4860, in BasePlotlyType.__setitem__(self, prop, value)
4858 # ### Handle compound property ###
4859 if isinstance(validator, CompoundValidator):
-> 4860 self._set_compound_prop(prop, value)
4862 # ### Handle compound array property ###
4863 elif isinstance(validator, (CompoundArrayValidator, BaseDataValidator)):
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/basedatatypes.py:5271, in BasePlotlyType._set_compound_prop(self, prop, val)
5268 # Import value
5269 # ------------
5270 validator = self._get_validator(prop)
-> 5271 val = validator.validate_coerce(val, skip_invalid=self._skip_invalid)
5273 # Save deep copies of current and new states
5274 # ------------------------------------------
5275 curr_val = self._compound_props.get(prop, None)
File ~/drought-311-venv/lib64/python3.11/site-packages/_plotly_utils/basevalidators.py:2511, in CompoundValidator.validate_coerce(self, v, skip_invalid, _validate)
2508 v = self.data_class()
2510 elif isinstance(v, dict):
-> 2511 v = self.data_class(v, skip_invalid=skip_invalid, _validate=_validate)
2513 elif isinstance(v, self.data_class):
2514 # Copy object
2515 v = self.data_class(v)
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/graph_objs/bar/_marker.py:1218, in Marker.__init__(self, arg, autocolorscale, cauto, cmax, cmid, cmin, color, coloraxis, colorbar, colorscale, colorsrc, cornerradius, line, opacity, opacitysrc, pattern, reversescale, showscale, **kwargs)
1216 _v = line if line is not None else _v
1217 if _v is not None:
-> 1218 self["line"] = _v
1219 _v = arg.pop("opacity", None)
1220 _v = opacity if opacity is not None else _v
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/basedatatypes.py:4860, in BasePlotlyType.__setitem__(self, prop, value)
4858 # ### Handle compound property ###
4859 if isinstance(validator, CompoundValidator):
-> 4860 self._set_compound_prop(prop, value)
4862 # ### Handle compound array property ###
4863 elif isinstance(validator, (CompoundArrayValidator, BaseDataValidator)):
File ~/drought-311-venv/lib64/python3.11/site-packages/plotly/basedatatypes.py:5271, in BasePlotlyType._set_compound_prop(self, prop, val)
5268 # Import value
5269 # ------------
5270 validator = self._get_validator(prop)
-> 5271 val = validator.validate_coerce(val, skip_invalid=self._skip_invalid)
5273 # Save deep copies of current and new states
5274 # ------------------------------------------
5275 curr_val = self._compound_props.get(prop, None)
File ~/drought-311-venv/lib64/python3.11/site-packages/_plotly_utils/basevalidators.py:2520, in CompoundValidator.validate_coerce(self, v, skip_invalid, _validate)
2518 v = self.data_class()
2519 else:
-> 2520 self.raise_invalid_val(v)
2522 v._plotly_name = self.plotly_name
2523 return v
File ~/drought-311-venv/lib64/python3.11/site-packages/_plotly_utils/basevalidators.py:296, in BaseValidator.raise_invalid_val(self, v, inds)
293 for i in inds:
294 name += "[" + str(i) + "]"
--> 296 raise ValueError(
297 """
298 Invalid value of type {typ} received for the '{name}' property of {pname}
299 Received value: {v}
300
301 {valid_clr_desc}""".format(
302 name=name,
303 pname=self.parent_name,
304 typ=type_str(v),
305 v=repr(v),
306 valid_clr_desc=self.description(),
307 )
308 )
ValueError:
Invalid value of type 'pandas.core.series.Series' received for the 'line' property of bar.marker
Received value: 0 #cccccc
1 #cccccc
2 #cccccc
3 #cccccc
4 #cccccc
...
1004 #291700
1005 #291700
1006 #291700
1007 #291700
1008 #291700
Name: Color, Length: 1009, dtype: object
The 'line' property is an instance of Line
that may be specified as:
- An instance of :class:`plotly.graph_objs.bar.marker.Line`
- A dict of string/value properties that will be passed
to the Line constructor
Supported dict properties:
autocolorscale
Determines whether the colorscale is a default
palette (`autocolorscale: true`) or the palette
determined by `marker.line.colorscale`. Has an
effect only if in `marker.line.color` is set to
a numerical array. In case `colorscale` is
unspecified or `autocolorscale` is true, the
default palette will be chosen according to
whether numbers in the `color` array are all
positive, all negative or mixed.
cauto
Determines whether or not the color domain is
computed with respect to the input data (here
in `marker.line.color`) or the bounds set in
`marker.line.cmin` and `marker.line.cmax` Has
an effect only if in `marker.line.color` is set
to a numerical array. Defaults to `false` when
`marker.line.cmin` and `marker.line.cmax` are
set by the user.
cmax
Sets the upper bound of the color domain. Has
an effect only if in `marker.line.color` is set
to a numerical array. Value should have the
same units as in `marker.line.color` and if
set, `marker.line.cmin` must be set as well.
cmid
Sets the mid-point of the color domain by
scaling `marker.line.cmin` and/or
`marker.line.cmax` to be equidistant to this
point. Has an effect only if in
`marker.line.color` is set to a numerical
array. Value should have the same units as in
`marker.line.color`. Has no effect when
`marker.line.cauto` is `false`.
cmin
Sets the lower bound of the color domain. Has
an effect only if in `marker.line.color` is set
to a numerical array. Value should have the
same units as in `marker.line.color` and if
set, `marker.line.cmax` must be set as well.
color
Sets the marker.line color. It accepts either a
specific color or an array of numbers that are
mapped to the colorscale relative to the max
and min values of the array or relative to
`marker.line.cmin` and `marker.line.cmax` if
set.
coloraxis
Sets a reference to a shared color axis.
References to these shared color axes are
"coloraxis", "coloraxis2", "coloraxis3", etc.
Settings for these shared color axes are set in
the layout, under `layout.coloraxis`,
`layout.coloraxis2`, etc. Note that multiple
color scales can be linked to the same color
axis.
colorscale
Sets the colorscale. Has an effect only if in
`marker.line.color` is set to a numerical
array. The colorscale must be an array
containing arrays mapping a normalized value to
an rgb, rgba, hex, hsl, hsv, or named color
string. At minimum, a mapping for the lowest
(0) and highest (1) values are required. For
example, `[[0, 'rgb(0,0,255)'], [1,
'rgb(255,0,0)']]`. To control the bounds of the
colorscale in color space, use
`marker.line.cmin` and `marker.line.cmax`.
Alternatively, `colorscale` may be a palette
name string of the following list: Blackbody,Bl
uered,Blues,Cividis,Earth,Electric,Greens,Greys
,Hot,Jet,Picnic,Portland,Rainbow,RdBu,Reds,Viri
dis,YlGnBu,YlOrRd.
colorsrc
Sets the source reference on Chart Studio Cloud
for `color`.
reversescale
Reverses the color mapping if true. Has an
effect only if in `marker.line.color` is set to
a numerical array. If true, `marker.line.cmin`
will correspond to the last color in the array
and `marker.line.cmax` will correspond to the
first color.
width
Sets the width (in px) of the lines bounding
the marker points.
widthsrc
Sets the source reference on Chart Studio Cloud
for `width`.
Setup function to detect continuous periods of a condition#
def detect_continuous_periods_with_dates(df, binary_col, date_col, min_sep=1):
    """
    Detects continuous periods of 1s in a binary vector within a DataFrame and returns a new DataFrame
    with the start date, end date, and duration of each period.

    Parameters:
    - df: Input DataFrame containing the binary vector and dates. NOTE(review):
      the index arithmetic below assumes a default integer RangeIndex (as
      produced by reset_index) — confirm for other callers.
    - binary_col: Column name for the binary vector (0s and 1s).
    - date_col: Column name for the corresponding dates.
    - min_sep: Minimum number of continuous 0s required to separate periods of 1s.

    Returns:
    - periods_df: A DataFrame with 'Start Date', 'End Date', and 'Duration' columns.
    """
    # Ensure binary_col is binary (0s and 1s)
    assert df[binary_col].isin([0, 1]).all(), "The binary column must contain only 0s and 1s."

    # Detect transitions in the binary column: +1 where a run of 1s starts,
    # -1 on the first 0 *after* a run ends.
    transitions = df[binary_col].diff().fillna(0)

    # Find where the vector changes from 0 to 1 (start of 1s) and 1 to 0 (end of 1s)
    start_ones = transitions == 1
    end_ones = transitions == -1

    # Get the indices of these transitions
    start_indices = start_ones[start_ones].index
    # NOTE(review): these end indices point at the first 0 after each run,
    # so 'End Date' is one step past the last 1 — except for a run that
    # reaches the end of the series, where the true last row is used.
    # Confirm this asymmetry is intended before relying on exact durations.
    end_indices = end_ones[end_ones].index

    # If the series starts with 1s, add a start at the beginning
    if df[binary_col].iloc[0] == 1:
        start_indices = pd.Index([df.index[0]]).append(start_indices)

    # If the series ends with 1s, add an end at the end
    if df[binary_col].iloc[-1] == 1:
        end_indices = end_indices.append(pd.Index([df.index[-1]]))

    # Ensure indices are aligned
    assert len(start_indices) == len(end_indices), "Mismatched start and end periods."

    # Filter out periods that are too close to each other based on min_sep.
    # NOTE(review): a period starting fewer than min_sep rows after the
    # previous one is dropped entirely rather than merged with it — confirm
    # against the "minimum separation" intent described in the notebook text.
    valid_periods = []
    last_end = -min_sep - 1  # Initialize last_end to be far enough back
    for start, end in zip(start_indices, end_indices):
        if start - last_end >= min_sep:
            valid_periods.append((start, end))
            last_end = end

    # Create a new DataFrame for the detected periods
    periods = []
    for start, end in valid_periods:
        start_date = df.loc[start, date_col]
        end_date = df.loc[end, date_col]
        duration = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month + 1  # Duration in months
        periods.append({'Start Date': start_date, 'End Date': end_date, 'Duration': duration})
    periods_df = pd.DataFrame(periods)

    return periods_df
Convert the timeline to a binary vector.#
Every dry classification is marked as drought (1) and everything else as no drought (0). Two drought periods must be separated by at least 2 drought-free months to count as distinct events.
min_sep = 2  # Minimum separation of 2 zeros to consider periods distinct

# Binarize the timeline: 1 for any dry classification, 0 for everything else.
dry_labels = ['Extremely Dry', 'Severely Dry', 'Moderately Dry', 'Mildly Dry']
result_df['class'] = result_df['Final Classification'].isin(dry_labels).astype(int)
Find the continuous periods and calculate their duration#
# Detect continuous drought periods (runs of 1s in 'class') with their
# start/end dates and durations in months.
periods_df = detect_continuous_periods_with_dates(result_df, binary_col='class', date_col='time', min_sep=min_sep)
periods_df
| Start Date | End Date | Duration | |
|---|---|---|---|
| 0 | 1941-04-01 06:00:00 | 1944-05-01 06:00:00 | 38 |
| 1 | 1944-08-01 06:00:00 | 1946-10-01 06:00:00 | 27 |
| 2 | 1963-09-01 06:00:00 | 1963-11-01 06:00:00 | 3 |
| 3 | 1964-07-01 06:00:00 | 1964-08-01 06:00:00 | 2 |
| 4 | 1965-08-01 06:00:00 | 1967-05-01 06:00:00 | 22 |
| 5 | 1967-12-01 06:00:00 | 1968-08-01 06:00:00 | 9 |
| 6 | 1984-10-01 06:00:00 | 1985-03-01 06:00:00 | 6 |
| 7 | 1988-04-01 06:00:00 | 1988-07-01 06:00:00 | 4 |
| 8 | 1990-05-01 06:00:00 | 1991-04-01 06:00:00 | 12 |
| 9 | 1993-10-01 06:00:00 | 1994-09-01 06:00:00 | 12 |
| 10 | 1997-08-01 06:00:00 | 1998-02-01 06:00:00 | 7 |
| 11 | 1998-11-01 06:00:00 | 1999-10-01 06:00:00 | 12 |
| 12 | 2000-04-01 06:00:00 | 2001-11-01 06:00:00 | 20 |
| 13 | 2002-02-01 06:00:00 | 2002-03-01 06:00:00 | 2 |
| 14 | 2002-05-01 06:00:00 | 2024-01-01 06:00:00 | 261 |
Plot all the event durations and mark the 90th/10th percentile lines to identify drought events with an anomalous duration#
def plot_duration_bar_plot(data, percentile=90):
    """Bar-plot drought event durations with percentile and median reference lines.

    Parameters:
    - data: DataFrame with one row per event and columns 'Duration' (months),
      'Start Date' and 'End Date' (timestamps).
    - percentile: upper percentile to mark; the lower line is drawn at
      (100 - percentile). Default 90 reproduces the 90th/10th percentile
      lines that were hard-coded before (the old percentile=75 parameter
      was never used by the function body).
    """
    upper_duration = np.percentile(data.Duration, percentile)
    lower_duration = np.percentile(data.Duration, 100 - percentile)
    median_duration = data.Duration.median()

    # Create the plot
    plt.figure(figsize=(10, 6))

    # One bar per event, indexed by event number
    plt.bar(data.index, data['Duration'], color='skyblue', edgecolor='black')

    # Dashed reference lines for the percentiles and the median duration
    plt.axhline(y=upper_duration, color='red', linestyle='--', linewidth=2,
                label=f'{percentile} percentile of durations: {upper_duration:.2f} months')
    plt.axhline(y=lower_duration, color='green', linestyle='--', linewidth=2,
                label=f'{100 - percentile} percentile of durations: {lower_duration:.2f} months')
    plt.axhline(y=median_duration, color='blue', linestyle='--', linewidth=2,
                label=f'Median duration: {median_duration:.2f} months')

    # Label each bar with the event's start and end months
    xticks_labels = [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}"
                     for start, end in zip(data['Start Date'], data['End Date'])]
    plt.xticks(ticks=np.arange(len(data.index)), labels=xticks_labels, rotation=45, ha='right')

    # Label axes
    plt.xlabel('Events')
    plt.ylabel('Duration (Months)')
    plt.title('Event Durations with Start and End Dates')

    # Add legend
    plt.legend()

    # Adjust layout for better fit
    plt.tight_layout()

    # Show the plot
    plt.show()

plot_duration_bar_plot(periods_df)
def plot_duration_bar_plot(data, percentile=90):
    """Interactive (plotly) bar plot of drought event durations.

    Parameters mirror the matplotlib version: `percentile` sets the upper
    reference line and the lower one is drawn at (100 - percentile).
    Default 90 reproduces the 90th/10th percentile lines that were
    hard-coded before (the old percentile=75 parameter was never used).
    """
    upper_duration = np.percentile(data.Duration, percentile)
    lower_duration = np.percentile(data.Duration, 100 - percentile)
    median_duration = data.Duration.median()

    # X labels: "start - end" month per event, drawn on a numeric axis so the
    # reference lines can extend past the first/last bar.
    x_labels = [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}"
                for start, end in zip(data['Start Date'], data['End Date'])]
    x_numeric = list(range(len(x_labels)))

    # Create bars for each event
    bar = go.Bar(
        x=x_numeric,
        y=data['Duration'],
        marker=dict(color='skyblue', line=dict(color='black', width=1)),
        name='Duration',
    )

    # Define the x-axis range for the lines (one unit beyond each end)
    line_x_values = [x_numeric[0] - 1, x_numeric[-1] + 1]

    def _hline(y_value, color, label):
        # Horizontal dashed reference line spanning the full x range.
        return go.Scatter(
            x=line_x_values,
            y=[y_value, y_value],
            mode='lines',
            line=dict(color=color, dash='dash'),
            name=label,
        )

    upper_line = _hline(upper_duration, 'red',
                        f'{percentile}th percentile: {upper_duration:.2f} months')
    lower_line = _hline(lower_duration, 'green',
                        f'{100 - percentile}th percentile: {lower_duration:.2f} months')
    median_line = _hline(median_duration, 'blue', f'Median: {median_duration:.2f} months')

    # Create the layout
    layout = go.Layout(
        title='Event Durations with Start and End Dates',
        xaxis=dict(
            title='Events',
            tickangle=-45,
            tickmode='array',
            tickvals=x_numeric,
            ticktext=x_labels,
            range=[x_numeric[0] - 1, x_numeric[-1] + 1],  # Extend x-axis range
        ),
        yaxis=dict(title='Duration (Months)'),
        barmode='group',
        legend=dict(x=1., y=0.5, orientation='v'),
        margin=dict(l=50, r=50, t=50, b=100),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    # Create the figure and add the traces
    fig = go.Figure(data=[bar, upper_line, lower_line, median_line], layout=layout)

    # Show the plot
    fig.show()

plot_duration_bar_plot(periods_df)
Calculate area percentage for each class for each month and aggregate for each event#
def calculate_area_percentage(monthly_data, periods):
    """Compute the mean area percentage of each drought/wetness class per event.

    For every event (row) in ``periods``, the monthly class areas whose
    ``time`` falls between the event's start and end dates (inclusive) are
    converted to percentages of that month's total classified area, then
    averaged over the event's months.

    Parameters
    ----------
    monthly_data : pd.DataFrame
        Must contain a ``time`` column plus one column per class listed in
        ``columns_to_use`` holding the absolute area for that class.
    periods : pd.DataFrame
        Must contain ``Start Date`` and ``End Date`` columns, one row per event.

    Returns
    -------
    pd.DataFrame
        One row per event: the mean ``'<class> %'`` columns plus the event's
        start and end dates.
    """
    columns_to_use = ['Extremely Dry',
                      'Severely Dry',
                      'Moderately Dry',
                      'Mildly Dry',
                      'Mildly Wet',
                      'Moderately Wet',
                      'Severely Wet',
                      'Extremely Wet']
    # Percentage column names, aligned one-to-one with columns_to_use.
    new_columns = [f'{c} %' for c in columns_to_use]
    rows = []
    for _, period in periods.iterrows():
        start_date = period['Start Date']
        end_date = period['End Date']
        in_event = (monthly_data.time >= start_date) & (monthly_data.time <= end_date)
        df = monthly_data.loc[in_event]
        # Total classified area per month; months with total 0 yield NaN shares.
        total = df[columns_to_use].sum(axis=1)
        # Per-month share (%) of each class in the total classified area.
        df_percentage = df[columns_to_use].div(total, axis=0) * 100
        # BUG FIX: the original used ``inplace=-True`` (the int -1), which
        # recent pandas rejects since ``inplace`` must be a bool. Renaming on a
        # fresh frame also avoids writing into a slice of ``monthly_data``
        # (SettingWithCopy / caller mutation).
        df_percentage = df_percentage.rename(columns=dict(zip(columns_to_use, new_columns)))
        # Average the monthly percentages over the whole event.
        rows.append(df_percentage.mean(axis=0))
    new_df = pd.concat(rows, axis=1).T.reset_index(drop=True)
    # Use .values so the dates attach positionally even when ``periods`` has a
    # non-default index (plain column assignment would align on index labels).
    new_df['Start Date'] = periods['Start Date'].values
    new_df['End Date'] = periods['End Date'].values
    return new_df
# Mean per-class area percentages for every detected event.
percentages = calculate_area_percentage(result_df, periods_df)
percentages
| class | Extremely Dry % | Severely Dry % | Moderately Dry % | Mildly Dry % | Mildly Wet % | Moderately Wet % | Severely Wet % | Extremely Wet % | Start Date | End Date |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.209626 | 2.843907 | 6.144640 | 26.647150 | 38.138592 | 7.933342 | 6.157509 | 10.925235 | 1941-04-01 06:00:00 | 1944-05-01 06:00:00 |
| 1 | 1.616409 | 2.598931 | 5.030336 | 31.694286 | 48.691479 | 5.637055 | 2.354433 | 2.377071 | 1944-08-01 06:00:00 | 1946-10-01 06:00:00 |
| 2 | 0.040750 | 0.000000 | 0.407498 | 7.538712 | 29.951100 | 17.807661 | 15.729421 | 28.524857 | 1963-09-01 06:00:00 | 1963-11-01 06:00:00 |
| 3 | 3.361858 | 1.222494 | 1.589242 | 5.867971 | 47.188264 | 32.273839 | 7.212714 | 1.283619 | 1964-07-01 06:00:00 | 1964-08-01 06:00:00 |
| 4 | 3.778617 | 1.828184 | 3.217382 | 26.605912 | 44.654368 | 12.736164 | 5.140031 | 2.039342 | 1965-08-01 06:00:00 | 1967-05-01 06:00:00 |
| 5 | 2.811736 | 1.725075 | 3.558816 | 11.735941 | 24.789459 | 16.503667 | 13.922847 | 24.952459 | 1967-12-01 06:00:00 | 1968-08-01 06:00:00 |
| 6 | 0.081500 | 0.651997 | 1.976365 | 9.881826 | 50.040750 | 31.744091 | 4.095355 | 1.528117 | 1984-10-01 06:00:00 | 1985-03-01 06:00:00 |
| 7 | 0.061125 | 0.000000 | 0.000000 | 10.849633 | 77.567237 | 8.863081 | 2.353301 | 0.305623 | 1988-04-01 06:00:00 | 1988-07-01 06:00:00 |
| 8 | 0.061125 | 0.000000 | 0.478810 | 14.343928 | 73.471883 | 10.177262 | 1.375306 | 0.091687 | 1990-05-01 06:00:00 | 1991-04-01 06:00:00 |
| 9 | 0.937245 | 2.006927 | 3.382233 | 17.817848 | 58.842706 | 16.554605 | 0.458435 | 0.000000 | 1993-10-01 06:00:00 | 1994-09-01 06:00:00 |
| 10 | 0.174642 | 0.157178 | 1.432064 | 14.477820 | 55.483758 | 27.209221 | 1.065316 | 0.000000 | 1997-08-01 06:00:00 | 1998-02-01 06:00:00 |
| 11 | 0.081500 | 0.804808 | 3.636919 | 10.951508 | 31.081907 | 48.268134 | 5.175224 | 0.000000 | 1998-11-01 06:00:00 | 1999-10-01 06:00:00 |
| 12 | 0.201711 | 0.482885 | 1.167482 | 17.732274 | 71.479218 | 8.655257 | 0.281174 | 0.000000 | 2000-04-01 06:00:00 | 2001-11-01 06:00:00 |
| 13 | 0.000000 | 0.000000 | 0.000000 | 4.400978 | 83.007335 | 12.530562 | 0.061125 | 0.000000 | 2002-02-01 06:00:00 | 2002-03-01 06:00:00 |
| 14 | 6.823015 | 12.851643 | 16.260574 | 41.335282 | 20.840476 | 1.189707 | 0.494150 | 0.205154 | 2002-05-01 06:00:00 | 2024-01-01 06:00:00 |
# Aggregate the four dry-category shares into a single 'Dry' column per event.
# (A stale commented-out column-reordering snippet was removed here.)
percentages['Dry'] = percentages.loc[:, ['Extremely Dry %', 'Severely Dry %', 'Moderately Dry %', 'Mildly Dry %']].sum(axis=1)
percentages
| class | Extremely Dry % | Severely Dry % | Moderately Dry % | Mildly Dry % | Mildly Wet % | Moderately Wet % | Severely Wet % | Extremely Wet % | Start Date | End Date | Dry |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.209626 | 2.843907 | 6.144640 | 26.647150 | 38.138592 | 7.933342 | 6.157509 | 10.925235 | 1941-04-01 06:00:00 | 1944-05-01 06:00:00 | 36.845322 |
| 1 | 1.616409 | 2.598931 | 5.030336 | 31.694286 | 48.691479 | 5.637055 | 2.354433 | 2.377071 | 1944-08-01 06:00:00 | 1946-10-01 06:00:00 | 40.939962 |
| 2 | 0.040750 | 0.000000 | 0.407498 | 7.538712 | 29.951100 | 17.807661 | 15.729421 | 28.524857 | 1963-09-01 06:00:00 | 1963-11-01 06:00:00 | 7.986960 |
| 3 | 3.361858 | 1.222494 | 1.589242 | 5.867971 | 47.188264 | 32.273839 | 7.212714 | 1.283619 | 1964-07-01 06:00:00 | 1964-08-01 06:00:00 | 12.041565 |
| 4 | 3.778617 | 1.828184 | 3.217382 | 26.605912 | 44.654368 | 12.736164 | 5.140031 | 2.039342 | 1965-08-01 06:00:00 | 1967-05-01 06:00:00 | 35.430096 |
| 5 | 2.811736 | 1.725075 | 3.558816 | 11.735941 | 24.789459 | 16.503667 | 13.922847 | 24.952459 | 1967-12-01 06:00:00 | 1968-08-01 06:00:00 | 19.831568 |
| 6 | 0.081500 | 0.651997 | 1.976365 | 9.881826 | 50.040750 | 31.744091 | 4.095355 | 1.528117 | 1984-10-01 06:00:00 | 1985-03-01 06:00:00 | 12.591687 |
| 7 | 0.061125 | 0.000000 | 0.000000 | 10.849633 | 77.567237 | 8.863081 | 2.353301 | 0.305623 | 1988-04-01 06:00:00 | 1988-07-01 06:00:00 | 10.910758 |
| 8 | 0.061125 | 0.000000 | 0.478810 | 14.343928 | 73.471883 | 10.177262 | 1.375306 | 0.091687 | 1990-05-01 06:00:00 | 1991-04-01 06:00:00 | 14.883863 |
| 9 | 0.937245 | 2.006927 | 3.382233 | 17.817848 | 58.842706 | 16.554605 | 0.458435 | 0.000000 | 1993-10-01 06:00:00 | 1994-09-01 06:00:00 | 24.144254 |
| 10 | 0.174642 | 0.157178 | 1.432064 | 14.477820 | 55.483758 | 27.209221 | 1.065316 | 0.000000 | 1997-08-01 06:00:00 | 1998-02-01 06:00:00 | 16.241705 |
| 11 | 0.081500 | 0.804808 | 3.636919 | 10.951508 | 31.081907 | 48.268134 | 5.175224 | 0.000000 | 1998-11-01 06:00:00 | 1999-10-01 06:00:00 | 15.474735 |
| 12 | 0.201711 | 0.482885 | 1.167482 | 17.732274 | 71.479218 | 8.655257 | 0.281174 | 0.000000 | 2000-04-01 06:00:00 | 2001-11-01 06:00:00 | 19.584352 |
| 13 | 0.000000 | 0.000000 | 0.000000 | 4.400978 | 83.007335 | 12.530562 | 0.061125 | 0.000000 | 2002-02-01 06:00:00 | 2002-03-01 06:00:00 | 4.400978 |
| 14 | 6.823015 | 12.851643 | 16.260574 | 41.335282 | 20.840476 | 1.189707 | 0.494150 | 0.205154 | 2002-05-01 06:00:00 | 2024-01-01 06:00:00 | 77.270513 |
def plot_area_bar_plot(data, columns_to_sum=None):
    """Plot a stacked bar chart of each drought class's area share per event.

    Parameters
    ----------
    data : pd.DataFrame
        One row per event; must contain 'Start Date', 'End Date' and the
        percentage columns (names containing '%').
    columns_to_sum : list[str] | None
        Percentage columns collapsed into a single 'Normal' bar. ``None``
        (the default) uses the non-extreme classes, matching the original
        default; pass ``[]`` to plot every class individually.

    Notes
    -----
    Relies on the module-level ``class_color_map`` for bar colors.
    """
    # BUG FIX: the original used a mutable list literal as the default
    # argument; use None as sentinel and build the list per call instead.
    if columns_to_sum is None:
        columns_to_sum = ['Moderately Dry %',
                          'Mildly Dry %',
                          'Mildly Wet %',
                          'Moderately Wet %',
                          'Severely Wet %',
                          'Extremely Wet %']
    # Percentage columns plotted as individual stacked segments.
    columns = [c for c in data.columns if '%' in c and c not in columns_to_sum]
    fig = go.Figure()
    x_axis_labels = [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}"
                     for start, end in zip(data['Start Date'], data['End Date'])]
    if columns_to_sum:
        # Collapse the selected classes into one 'Normal' bar.
        fig.add_trace(go.Bar(
            x=x_axis_labels,
            y=data[columns_to_sum].sum(axis=1),
            name='Normal',
            marker=dict(color=class_color_map['Severely Wet'])
        ))
    # Reverse order so stacking matches the legend's class ordering.
    for category in columns[::-1]:
        fig.add_trace(go.Bar(
            x=x_axis_labels,
            y=data[category],
            name=category[:-2],  # strip the trailing ' %' to get the class name
            marker=dict(color=class_color_map[category[:-2]])
        ))
    fig.update_layout(
        barmode='stack',  # stack the bars per event
        title='Area of each type of drought',
        xaxis=dict(title='Events',
                   tickangle=-45,  # rotate the x-axis labels by -45 degrees
                   tickmode='array',
                   tickvals=x_axis_labels,
                   ticktext=x_axis_labels,),
        yaxis=dict(title='Percentage'),
        legend=dict(orientation='v', x=1, y=0.5),
        margin=dict(l=50, r=50, t=50, b=100),
        paper_bgcolor='rgba(0,0,0,0)',  # transparent background for the paper
        plot_bgcolor='rgba(0,0,0,0)'
    )
    fig.show()
# First: every class plotted individually (no 'Normal' aggregation);
# second: default grouping with the non-extreme classes collapsed.
plot_area_bar_plot(percentages, columns_to_sum=[])
plot_area_bar_plot(percentages)